from __future__ import division
from scipy import c_, ones, dot, stats, diff
from scipy.linalg import inv, solve, det
from numpy import log, pi, sqrt, square, diagonal
from numpy.random import randn, seed
import time

class ols:
	"""
	Class for multi-variate regression using OLS.

	Author: Vincent Nijs (+ ?)

	Email: v-nijs at kellogg.northwestern.edu

	Last Modified: Mon Jan 15 17:56:17 CST 2007

	Dependencies: See import statement at the top of this file

	For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
	simply run this file using 'python ols.py'. This will generate some simulated data and run various analyses. If you have rpy installed
	the same model will also be estimated by R for confirmation.

	Input:
		y = dependent variable (1-D array of length nobs)
		y_varnm = string with the variable label for y
		x = independent variables (nobs x k array), note that a constant is added by default
		x_varnm = string or list of variable labels for the independent variables

	Output:
		There are no values returned by the class. summary() provides printed output.
		All other measures can be accessed as follows:

		Step 1: Create an OLS instance by passing data to the class

			m = ols(y, x, y_varnm='y', x_varnm=['x1', 'x2', 'x3', 'x4'])

		Step 2: Get specific metrics

			To print the coefficients:
				>>> print(m.b)
			To print the coefficients p-values:
				>>> print(m.p)

	"""

	def __init__(self, y, x, y_varnm='y', x_varnm=''):
		"""
		Store the data, prepend an intercept column to x, build the variable
		labels, and estimate the model.
		"""
		self.y = y
		self.x = c_[ones(x.shape[0]), x]	# add the constant term
		self.y_varnm = y_varnm
		if isinstance(x_varnm, list):
			self.x_varnm = ['const'] + x_varnm
		else:
			# A single label passed as a string. NOTE: the original code used
			# list(x_varnm) here, which split a multi-character label into
			# individual characters; keep the label whole instead. The empty
			# string default contributes no label beyond 'const'.
			self.x_varnm = ['const'] + ([x_varnm] if x_varnm else [])

		# Estimate model using OLS
		self.estimate()

	def estimate(self):
		"""
		Estimate the regression and store all derived statistics as attributes:
		b (coefficients), se (standard errors), t (t-statistics),
		p (two-sided p-values), R2/R2adj, F and Fpv.
		"""
		# (X'X)^-1 is kept as an attribute because it is reused for the
		# coefficient standard errors below
		self.inv_xx = inv(dot(self.x.T, self.x))
		xy = dot(self.x.T, self.y)
		self.b = dot(self.inv_xx, xy)					# estimate coefficients

		self.nobs = self.y.shape[0]						# number of observations
		self.ncoef = self.x.shape[1]					# number of coef.
		self.df_e = self.nobs - self.ncoef				# degrees of freedom, error
		self.df_r = self.ncoef - 1						# degrees of freedom, regression

		self.e = self.y - dot(self.x, self.b)			# residuals
		self.sse = dot(self.e, self.e) / self.df_e		# SSE
		self.se = sqrt(diagonal(self.sse * self.inv_xx))	# coef. standard errors
		self.t = self.b / self.se						# coef. t-statistics
		self.p = (1 - stats.t.cdf(abs(self.t), self.df_e)) * 2		# coef. p-values

		self.R2 = 1 - self.e.var() / self.y.var()		# model R-squared
		self.R2adj = 1 - (1 - self.R2) * ((self.nobs - 1) / (self.nobs - self.ncoef))	# adjusted R-square

		self.F = (self.R2 / self.df_r) / ((1 - self.R2) / self.df_e)	# model F-statistic
		self.Fpv = 1 - stats.f.cdf(self.F, self.df_r, self.df_e)		# F-statistic p-value

	def dw(self):
		"""
		Calculates the Durbin-Watson statistic for first-order residual
		autocorrelation (values near 2 indicate no autocorrelation).
		"""
		de = diff(self.e, 1)
		dw = dot(de, de) / dot(self.e, self.e)

		return dw

	def omni(self):
		"""
		Omnibus test for normality of the residuals.
		Returns the (statistic, p-value) pair from scipy.stats.normaltest.
		"""
		return stats.normaltest(self.e)

	def JB(self):
		"""
		Calculate residual skewness, kurtosis, and do the JB test for normality.

		Returns (JB, JBpv, kurtosis, skew) -- note the order: kurtosis before
		skew. Callers must unpack accordingly.
		"""

		# Calculate residual skewness and kurtosis
		skew = stats.skew(self.e)
		kurtosis = 3 + stats.kurtosis(self.e)	# scipy returns excess kurtosis

		# Calculate the Jarque-Bera test for normality
		JB = (self.nobs / 6) * (square(skew) + (1 / 4) * square(kurtosis - 3))
		JBpv = 1 - stats.chi2.cdf(JB, 2)

		return JB, JBpv, kurtosis, skew

	def ll(self):
		"""
		Calculate model log-likelihood and two information criteria.
		Returns (ll, aic, bic); both criteria are normalized by nobs.
		"""

		# Model log-likelihood, AIC, and BIC criterion values
		ll = -(self.nobs * 1 / 2) * (1 + log(2 * pi)) - (self.nobs / 2) * log(dot(self.e, self.e) / self.nobs)
		aic = -2 * ll / self.nobs + (2 * self.ncoef / self.nobs)
		bic = -2 * ll / self.nobs + (self.ncoef * log(self.nobs)) / self.nobs

		return ll, aic, bic

	def summary(self):
		"""
		Printing model output to screen.
		"""

		# local time & date
		t = time.localtime()

		# extra stats
		ll, aic, bic = self.ll()
		# BUG FIX: JB() returns (JB, JBpv, kurtosis, skew); the original
		# unpacked skew and kurtosis in the wrong order, so the printed
		# summary swapped the two statistics.
		JB, JBpv, kurtosis, skew = self.JB()
		omni, omnipv = self.omni()

		# printing output to screen
		print('\n==============================================================================')
		print("Dependent Variable: " + self.y_varnm)
		print("Method: Least Squares")
		print("Date: ", time.strftime("%a, %d %b %Y", t))
		print("Time: ", time.strftime("%H:%M:%S", t))
		print('# obs:				%5.0f' % self.nobs)
		print('# variables:		%5.0f' % self.ncoef)
		print('==============================================================================')
		print('variable		coefficient		std. Error		t-statistic		prob.')
		print('==============================================================================')
		for i in range(len(self.x_varnm)):
			print('''% -5s			% -5.6f		% -5.6f		% -5.6f		% -5.6f''' % tuple([self.x_varnm[i], self.b[i], self.se[i], self.t[i], self.p[i]]))
		print('==============================================================================')
		print('Models stats							Residual stats')
		print('==============================================================================')
		print('R-squared			% -5.6f			Durbin-Watson stat	% -5.6f' % tuple([self.R2, self.dw()]))
		print('Adjusted R-squared	% -5.6f			Omnibus stat		% -5.6f' % tuple([self.R2adj, omni]))
		print('F-statistic			% -5.6f			Prob(Omnibus stat)	% -5.6f' % tuple([self.F, omnipv]))
		print('Prob (F-statistic)	% -5.6f			JB stat				% -5.6f' % tuple([self.Fpv, JB]))
		print('Log likelihood		% -5.6f			Prob(JB)			% -5.6f' % tuple([ll, JBpv]))
		print('AIC criterion		% -5.6f			Skew				% -5.6f' % tuple([aic, skew]))
		print('BIC criterion		% -5.6f			Kurtosis			% -5.6f' % tuple([bic, kurtosis]))
		print('==============================================================================')

if __name__ == '__main__':

	##########################
	### testing the ols class
	##########################

	# creating simulated data and variable labels
	seed(1)
	data = randn(100, 5)			# the data array

	# intercept is added, by default
	m = ols(data[:, 0], data[:, 1:], y_varnm='y', x_varnm=['x1', 'x2', 'x3', 'x4'])
	m.summary()

	# if you have rpy installed, use it to test the results.
	# Keep the try body down to the single statement that can raise; the
	# original printed a banner inside the try as well, so a failed import
	# produced two banners back to back.
	have_rpy = False
	try:
		import rpy
		have_rpy = True
	except ImportError:
		pass

	if have_rpy:
		print("\n")
		print("=" * 30)
		print("Validating OLS results in R")
		print("=" * 30)

		y = data[:, 0]
		x1 = data[:, 1]
		x2 = data[:, 2]
		x3 = data[:, 3]
		x4 = data[:, 4]
		rpy.set_default_mode(rpy.NO_CONVERSION)
		linear_model = rpy.r.lm(rpy.r("y ~ x1 + x2 + x3 + x4"), data=rpy.r.data_frame(x1=x1, x2=x2, x3=x3, x4=x4, y=y))
		rpy.set_default_mode(rpy.BASIC_CONVERSION)
		print(linear_model.as_py()['coefficients'])
		summary = rpy.r.summary(linear_model)
		print(summary)
	else:
		print("\n")
		print("=" * 30)
		print("Validating OLS-class results in R")
		print("=" * 30)
		print("rpy is not installed")
		print("=" * 30)
